A Quick Demo of Parallel Processing in R

R will run in parallel. A number of packages help with this. Some require “forking” which Windows will not support.

  p_load(parallel)
  p_load(MASS)
  starts <- rep(100,4)
  fx <- function(nstart) kmeans(Boston, 4, nstart=nstart)
  numCores <- detectCores()
  numCores

## [1] 16

  microbenchmark(lapply(starts, fx))

## Unit: milliseconds
##                expr     min      lq    mean   median       uq     max neval
##  lapply(starts, fx) 61.9797 68.4986 72.4378 72.72365 76.56555 87.6348   100

  microbenchmark(lapply(starts, fx), mclapply(starts, fx, mc.cores=numCores))

## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows

  microbenchmark(results1 <- lapply(starts, fx),
                 results2 <- mclapply(starts, fx, mc.cores=numCores)
                )

## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows

  microbenchmark(lapply(starts, fx), mclapply(starts, fx))

## Unit: milliseconds
##                  expr     min       lq     mean   median       uq      max
##    lapply(starts, fx) 60.9104 64.69205 68.09924 65.56640 67.60510 152.7054
##  mclapply(starts, fx) 61.7964 64.67095 66.80696 65.71785 68.90565  76.5378
##  neval cld
##    100   a
##    100   a

  microbenchmark(results1 <- lapply(starts, fx), 
                 results2 <- mclapply(starts, fx)
                )

## Unit: milliseconds
##                              expr     min       lq     mean  median      uq
##    results1 <- lapply(starts, fx) 61.8234 64.20815 65.32908 65.1188 65.9042
##  results2 <- mclapply(starts, fx) 59.7178 64.34565 66.41706 65.0911 66.1897
##       max neval cld
##   74.3293   100   a
##  144.0106   100   a

We can use the doParallel package with foreach to improve looping.

First some setup.

  p_load(foreach)
  p_load(doParallel)
  registerDoParallel(2)
  getDoParWorkers()

## [1] 2

  getDoParName()

## [1] "doParallelSNOW"

  getDoParVersion()

## [1] "1.0.17"

Now carry out some simple calculations to show how to use parallel processing.

  microbenchmark(
    for (i in 1:10){
      sqrt(i)
    },
    foreach(i=1:10) %do% {
      sqrt(i)
    },
    foreach (i=1:10) %dopar% {
      sqrt(i)
    },
    foreach (i=1:10, .combine=c) %dopar% {
      sqrt(i)
    }
  )

## Unit: microseconds
##                                                     expr    min      lq
##                          for (i in 1:10) {     sqrt(i) }  831.6  916.50
##                   foreach(i = 1:10) %do% {     sqrt(i) } 1882.1 1983.10
##                foreach(i = 1:10) %dopar% {     sqrt(i) } 3087.3 3468.10
##  foreach(i = 1:10, .combine = c) %dopar% {     sqrt(i) } 3146.5 3489.35
##      mean  median      uq     max neval cld
##   989.478  949.75 1010.45  1545.7   100 a  
##  2166.570 2036.55 2210.00  4726.9   100  b 
##  3843.430 3629.45 4010.95  8234.5   100   c
##  3997.699 3666.30 4028.05 22876.0   100   c

Note that the use of parallel processing in this case is not helping.

Try some bootstrapping to check performance on a “real” problem.

 x <- iris[which(iris[,5] != "setosa"), c(1,5)] 
 trials <- 10000 
 
 microbenchmark( 
    ### Parallel (%dopar%)
    bs1 <- foreach(icount(trials), .combine=cbind) %dopar% { 
                  ind <- sample(100, 100, replace=TRUE) 
                  result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit)) 
                  coefficients(result1) 
    },
    ### Not parallel (%do%)
    bs2 <- foreach(icount(trials), .combine=cbind) %do% { 
                  ind <- sample(100, 100, replace=TRUE) 
                  result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit)) 
                  coefficients(result1) 
                  } 
 )

## Unit: seconds
##                                                                                                                                                                                                    expr
##  bs1 <- foreach(icount(trials), .combine = cbind) %dopar% {     ind <- sample(100, 100, replace = TRUE)     result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))     coefficients(result1) }
##     bs2 <- foreach(icount(trials), .combine = cbind) %do% {     ind <- sample(100, 100, replace = TRUE)     result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))     coefficients(result1) }
##        min        lq      mean    median        uq      max neval cld
##   8.597918  8.713557  8.835058  8.757424  8.937364 10.64344   100  a 
##  11.985286 12.108203 12.316349 12.264483 12.420281 14.46708   100   b

 ### Look at the bootstrap since we ran it
 bs1 <- t(bs1)
 bs2 <- t(bs2)
 
 apply(bs1,2,mean)

## (Intercept)   x[ind, 1] 
##  -13.137360    2.104112

 apply(bs1,2,var)

## (Intercept)   x[ind, 1] 
##   9.6451353   0.2473618

 apply(bs1,2,hist)

## $`(Intercept)`
## $breaks
##  [1] -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10  -8  -6  -4  -2
## 
## $counts
##  [1]    1    3   10   22   51  159  428  976 1932 2602 2339 1171  283   22    1
## 
## $density
##  [1] 0.00005 0.00015 0.00050 0.00110 0.00255 0.00795 0.02140 0.04880 0.09660
## [10] 0.13010 0.11695 0.05855 0.01415 0.00110 0.00005
## 
## $mids
##  [1] -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11  -9  -7  -5  -3
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $`x[ind, 1]`
## $breaks
##  [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
## 
## $counts
##  [1]   28  907 3570 3532 1489  377   74   19    3    1
## 
## $density
##  [1] 0.0056 0.1814 0.7140 0.7064 0.2978 0.0754 0.0148 0.0038 0.0006 0.0002
## 
## $mids
##  [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

 apply(bs2,2,mean)

## (Intercept)   x[ind, 1] 
##  -13.166303    2.109399

 apply(bs2,2,var)

## (Intercept)   x[ind, 1] 
##   9.9107502   0.2536903

 apply(bs2,2,hist)

## $`(Intercept)`
## $breaks
##  [1] -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10  -8  -6  -4  -2
## 
## $counts
##  [1]    1    1    1    3   15   72  188  454  970 1934 2549 2311 1212  257   31
## [16]    1
## 
## $density
##  [1] 0.00005 0.00005 0.00005 0.00015 0.00075 0.00360 0.00940 0.02270 0.04850
## [10] 0.09670 0.12745 0.11555 0.06060 0.01285 0.00155 0.00005
## 
## $mids
##  [1] -33 -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11  -9  -7  -5  -3
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $`x[ind, 1]`
## $breaks
##  [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
## 
## $counts
##  [1]   44  901 3539 3510 1482  420   89   12    1    2
## 
## $density
##  [1] 0.0088 0.1802 0.7078 0.7020 0.2964 0.0840 0.0178 0.0024 0.0002 0.0004
## 
## $mids
##  [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

A Quick Demo of Parallel Processing in R

Oliver de Pug